# import dependencies
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE
import json
import base64
import zlib
from pyscenic.plotting import plot_binarization
# set variables for file paths to read from and write to:
# set a working directory
wdir = "/ddn1/vol1/staging/leuven/stg_00002/lcb/cflerin/testruns/scenic-protocol/prottest3/"
os.chdir( wdir )
# path to loom output, generated from a combination of Scanpy and pySCENIC results:
f_final_loom = 'pbmc10k_scenic_integrated-output.loom'
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=120)
# scenic output
lf = lp.connect( f_final_loom, mode='r', validate=False )
meta = json.loads(zlib.decompress(base64.b64decode( lf.attrs.MetaData )))
exprMat = pd.DataFrame( lf[:,:], index=lf.ra.Gene, columns=lf.ca.CellID).T
auc_mtx = pd.DataFrame( lf.ca.RegulonsAUC, index=lf.ca.CellID)
#regulons = lf.ra.Regulons
###
# create a dictionary of regulons:
regulons = {}
for i,r in pd.DataFrame(lf.ra.Regulons,index=lf.ra.Gene).iteritems():
regulons[i] = list(r[r==1].index.values)
cellAnnot = pd.concat(
[
pd.DataFrame( lf.ca.Celltype_Garnett, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.ClusterID, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.Louvain_clusters_Scanpy, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.Percent_mito, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.nGene, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.nUMI, index=lf.ca.CellID ),
],
axis=1
)
cellAnnot.columns = [
'Celltype_Garnett',
'ClusterID',
'Louvain_clusters_Scanpy',
'Percent_mito',
'nGene',
'nUMI']
lf.close()
from pyscenic.rss import regulon_specificity_scores
from pyscenic.plotting import plot_rss
import matplotlib.pyplot as plt
from adjustText import adjust_text
import seaborn as sns
from pyscenic.binarization import binarize
rss_cellType = regulon_specificity_scores( auc_mtx, cellAnnot['Celltype_Garnett'] )
rss_cellType
cats = sorted(list(set(cellAnnot['Celltype_Garnett'])))
fig = plt.figure(figsize=(15, 8))
for c,num in zip(cats, range(1,len(cats)+1)):
x=rss_cellType.T[c]
ax = fig.add_subplot(2,5,num)
plot_rss(rss_cellType, c, top_n=5, max_n=None, ax=ax)
ax.set_ylim( x.min()-(x.max()-x.min())*0.05 , x.max()+(x.max()-x.min())*0.05 )
for t in ax.texts:
t.set_fontsize(12)
ax.set_ylabel('')
ax.set_xlabel('')
adjust_text(ax.texts, autoalign='xy', ha='right', arrowprops=dict(arrowstyle='-',color='lightgrey'), precision=0.001 )
fig.text(0.5, 0.0, 'Regulon', ha='center', va='center', size='x-large')
fig.text(0.00, 0.5, 'Regulon specificity score (RSS)', ha='center', va='center', rotation='vertical', size='x-large')
plt.tight_layout()
plt.rcParams.update({
'figure.autolayout': True,
'figure.titlesize': 'large' ,
#'axes.labelsize': 'x-large',
#'axes.titlesize':'x-large',
'xtick.labelsize':'large',
'ytick.labelsize':'large'
})
plt.savefig("PBMC10k_cellType-RSS-top5.png", dpi=150, bbox_inches = "tight")
plt.show()
topreg = []
for i,c in enumerate(cats):
topreg.extend(
list(rss_cellType.T[c].sort_values(ascending=False)[:5].index)
)
topreg = list(set(topreg))
Aggregate the AUC matrix by cell type:
auc_mtx_celltype = auc_mtx.groupby(cellAnnot['Celltype_Garnett'],axis=0).mean()
auc_mtx_celltype
fig, ax = plt.subplots(1, 1, figsize = (8,8), dpi=300)
sns.heatmap(auc_mtx_celltype[topreg], ax=ax, annot=True, fmt=".2f", linewidths=.7, cbar=False, square=True, linecolor='gray',
cmap="YlGnBu", annot_kws={"size": 5})
ax.tick_params(axis='both', which='major', pad=-2)
sns.set(font_scale=0.7)
ax.set_ylabel('')
ax.set_xlabel('')
plt.tight_layout()
plt.savefig("PBMC10k_cellType-AUC-heatmap-top5.png", dpi=150, bbox_inches = "tight")
plt.show()
g = sns.clustermap(auc_mtx_celltype[topreg], annot=True, fmt=".2f", linewidths=.7, cbar=False, square=False, linecolor='gray',
cmap="YlGnBu", annot_kws={"size": 6}, figsize=(12,5) )
sns.set(font_scale=0.7)
g.cax.set_visible(False)
g.ax_heatmap.set_ylabel('')
g.ax_heatmap.set_xlabel('')
binary_mtx, auc_thresholds = binarize( auc_mtx, num_workers=25 )
binary_mtx.head()
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4), dpi=150)
plot_binarization(auc_mtx, 'RXRA_(+)', auc_thresholds['RXRA_(+)'], ax=ax1)
plot_binarization(auc_mtx, 'NFE2_(+)', auc_thresholds['NFE2_(+)'], ax=ax2)
plot_binarization(auc_mtx, 'ETV2_(+)', auc_thresholds['ETV2_(+)'], ax=ax3)
ax1.set_ylabel('')
ax2.set_ylabel('')
ax3.set_ylabel('')
fig.text(0.00, 0.5, 'Frequency', ha='center', va='center', rotation='vertical', size='x-large')
fig.tight_layout()
fig.savefig("PBMC10k_cellType-binaryPlot.png", dpi=150, bbox_inches = "tight")
#plt.show()
rss_louvain = regulon_specificity_scores( auc_mtx, cellAnnot['Louvain_clusters_Scanpy'] )
rss_louvain
cats = sorted( list(set(cellAnnot['Louvain_clusters_Scanpy'])), key=int )
fig = plt.figure(figsize=(15, 12))
for c,num in zip(cats, range(1,len(cats)+1)):
x=rss_louvain.T[c]
ax = fig.add_subplot(3,5,num)
plot_rss(rss_louvain, c, top_n=5, max_n=None, ax=ax)
ax.set_ylim( x.min()-(x.max()-x.min())*0.05 , x.max()+(x.max()-x.min())*0.05 )
for t in ax.texts:
t.set_fontsize(12)
ax.set_ylabel('')
ax.set_xlabel('')
adjust_text(ax.texts, autoalign='xy', ha='right', arrowprops=dict(arrowstyle='-',color='lightgrey'), precision=0.001 )
fig.text(0.5, 0.0, 'Regulon', ha='center', va='center', size='x-large')
fig.text(0.00, 0.5, 'Regulon specificity score (RSS)', ha='center', va='center', rotation='vertical', size='x-large')
plt.tight_layout()
plt.rcParams.update({
'figure.autolayout': True,
'figure.titlesize': 'large' ,
#'axes.labelsize': 'x-large',
#'axes.titlesize':'x-large',
'xtick.labelsize':'large',
'ytick.labelsize':'large'
})
plt.savefig("PBMC10k_Louvain-RSS-top5.png", dpi=150, bbox_inches = "tight")
plt.show()
topreg = []
for i,c in enumerate(cats):
topreg.extend(
list(rss_louvain.T[c].sort_values(ascending=False)[:5].index)
)
topreg = list(set(topreg))
Aggregate the AUC matrix by cell type:
auc_mtx_louvain = auc_mtx.groupby(cellAnnot['Louvain_clusters_Scanpy'],axis=0).mean()
auc_mtx_louvain
fig, ax = plt.subplots(1, 1, figsize = (8,12), dpi=300)
sns.heatmap(auc_mtx_louvain[topreg], ax=ax, annot=True, fmt=".2f", linewidths=.7, cbar=False, square=True, linecolor='gray',
cmap="YlGnBu", annot_kws={"size": 5})
ax.tick_params(axis='both', which='major', pad=-2)
sns.set(font_scale=0.7)
ax.set_ylabel('')
ax.set_xlabel('')
plt.tight_layout()
plt.savefig("PBMC10k_Louvain-AUC-heatmap-top5.png", dpi=150, bbox_inches = "tight")
plt.show()
g = sns.clustermap(auc_mtx_louvain[topreg], annot=True, fmt=".2f", linewidths=.7, cbar=False, square=False, linecolor='gray',
cmap="YlGnBu", annot_kws={"size": 6}, figsize=(14,7) )
sns.set(font_scale=0.7)
g.cax.set_visible(False)
g.ax_heatmap.set_ylabel('')
g.ax_heatmap.set_xlabel('')
adjacencies = pd.read_csv("adj.tsv", index_col=False, sep='\t')
Create the modules:
from pyscenic.utils import modules_from_adjacencies
modules = list(modules_from_adjacencies(adjacencies, exprMat))
tf = 'EBF1'
tf_mods = [ x for x in modules if x.transcription_factor==tf ]
for i,mod in enumerate( tf_mods ):
print( f'{tf} module {str(i)}: {len(mod.genes)} genes' )
print( f'{tf} regulon: {len(regulons[tf+"_(+)"])} genes' )
write these modules, and the regulon to files:
for i,mod in enumerate( tf_mods ):
with open( tf+'_module_'+str(i)+'.txt', 'w') as f:
for item in mod.genes:
f.write("%s\n" % item)
with open( tf+'_regulon.txt', 'w') as f:
for item in regulons[tf+'_(+)']:
f.write("%s\n" % item)
from IPython.display import display, Image
display(Image(filename='iRegulon_screenshot_PBMC10k-EBF1.png'))
Tracks will be available soon on the cisTarget database download page